Passengers Data

Load data

dat <- import(here("data", "dat.csv")) %>% 
   clean_names() %>% 
   mutate_all(na_if,"")

Clean data

dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$status <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked  <- as.factor(dat$disembarked)

dat <- dat %>% 
 mutate(nationality2 = case_when(nationality == "English" ~ "English",
   nationality == "Irish" ~ "Irish",
   nationality == "American" ~ "American",
   nationality == "Swedish" ~ "Swedish",
   nationality == "Finnish" ~ "Finnish",
   nationality == "Scottish" ~ "Scottish",
   nationality == "French" ~ "French",
   nationality == "Italian" ~ "Italian",
   nationality == "Canadian" ~ "Canadian",
   nationality == "Bulgarian" ~ "Bulgarian",
   nationality == "Croatian" ~ "Croatian",
   nationality == "Belgian" ~ "Belgian",
   nationality == "Norwegian" ~ "Norwegian",
   nationality == "Channel Islander" ~ "Channel Islander",
   nationality == "Welsh" ~ "Welsh",
   nationality == "Swiss" ~ "Swiss",
   nationality == "German" ~ "German",
   nationality == "Danish" ~ "Danish",
   nationality == "Spanish" ~ "Spanish",
   nationality == "Australian" ~ "Australian",
   nationality == "Polish" ~ "Polish",
   nationality == "South African" ~ "South African",
   nationality == "Bosnian" ~ "Bosnian",
   nationality == "Hong Kongese" ~ "Hong Kongese",
   nationality == "Dutch" ~ "Dutch",
   nationality == "Lithuanian" ~ "Lithuanian",
   nationality == "Greek" ~ "Greek",
   nationality == "Portuguese" ~ "Portuguese",
   nationality == "Uruguayan" ~ "Uruguayan",
   nationality == "Chinese" ~ "Chinese",
   nationality == "Slovenian" ~ "Slovenian",
   nationality == "Cape Verdean" ~ "Cape Verdean",
   nationality == "Egyptian" ~ "Egyptian",
   nationality == "Japanese" ~ "Japanese",
   nationality == "Hungarian" ~ "Hungarian",
   nationality == "Bosnian" ~ "Bosnian",
   nationality == "Hong Kongese" ~ "Hong Kongese",
   nationality == "Latvian" ~ "Latvian",
   nationality == "Austrian" ~ "Austrian",
   nationality == "Greek" ~ "Greek",
   nationality == "Mexican" ~ "Mexican",
   nationality == "Sweden" ~ "Sweedish",
   nationality == "Turkish" ~ "Turkish",
   nationality == "Slovenian" ~ "Slovenian",
   nationality == "Guyanese" ~ "Guyanese",
   nationality == "Haitian" ~ "Haitian",
   nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
   nationality == "Unknown" ~ "Unknown",
   TRUE ~ "Other - Multiple", ))

dat <- dat %>% 
   mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))

Descriptives

# Breakdown of passengers by class and gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(gender)) %>% 
   group_by(class, gender) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100)
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups:   class [3]
##   class     gender count percent
##   <fct>     <fct>  <int>   <dbl>
## 1 1st Class Female   153    43.7
## 2 1st Class Male     197    56.3
## 3 2nd Class Female   112    38.4
## 4 2nd Class Male     180    61.6
## 5 3rd Class Female   216    30.5
## 6 3rd Class Male     493    69.5
# Breakdown of passenger nationalities
dat %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(desc(percent))
## # A tibble: 44 x 3
##    nationality2     count percent
##    <chr>            <int>   <dbl>
##  1 English           1037   42.4 
##  2 Irish              361   14.7 
##  3 American           246   10.0 
##  4 Other - Multiple   116    4.74
##  5 Swedish             99    4.04
##  6 Syrian/Lebanese     86    3.51
##  7 Finnish             58    2.37
##  8 Scottish            49    2.00
##  9 French              44    1.80
## 10 Italian             41    1.67
## # ... with 34 more rows
# Breakdown of passenger nationalities by class (all)
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(class, nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, desc(percent))
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 79 x 4
## # Groups:   class [3]
##    class     nationality2     count percent
##    <fct>     <chr>            <int>   <dbl>
##  1 1st Class American           195   57.4 
##  2 1st Class English             51   15   
##  3 1st Class Canadian            27    7.94
##  4 1st Class Other - Multiple    14    4.12
##  5 1st Class French              10    2.94
##  6 1st Class Irish                6    1.76
##  7 1st Class Swiss                6    1.76
##  8 1st Class German               5    1.47
##  9 1st Class Scottish             5    1.47
## 10 1st Class Spanish              4    1.18
## # ... with 69 more rows
# Breakdown of passenger nationalities by class (>= 5%)
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(class, nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   filter(percent >= 5) %>% 
   arrange(class, desc(percent))
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 12 x 4
## # Groups:   class [3]
##    class     nationality2     count percent
##    <fct>     <chr>            <int>   <dbl>
##  1 1st Class American           195   57.4 
##  2 1st Class English             51   15   
##  3 1st Class Canadian            27    7.94
##  4 2nd Class English            145   51.1 
##  5 2nd Class Other - Multiple    25    8.80
##  6 2nd Class American            24    8.45
##  7 3rd Class English            112   15.8 
##  8 3rd Class Irish              105   14.8 
##  9 3rd Class Swedish             89   12.6 
## 10 3rd Class Syrian/Lebanese     83   11.7 
## 11 3rd Class Other - Multiple    69    9.73
## 12 3rd Class Finnish             52    7.33
# Average age by class
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   group_by(class) %>% 
   summarize(avg_age = mean(age), min_age = min(age), max_age = max(age))
## # A tibble: 3 x 4
##   class     avg_age min_age max_age
##   <fct>       <dbl>   <int>   <int>
## 1 1st Class    39.1       0      71
## 2 2nd Class    30.0       0      71
## 3 3rd Class    25.1       0      74
# Survival rate by class
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(class, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, survived)
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups:   class [3]
##   class     survived count percent
##   <fct>     <chr>    <int>   <dbl>
## 1 1st Class Lost       123    38.0
## 2 1st Class Saved      201    62.0
## 3 2nd Class Lost       166    58.5
## 4 2nd Class Saved      118    41.5
## 5 3rd Class Lost       528    74.5
## 6 3rd Class Saved      181    25.5
# Survival rate by gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(gender, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(gender, survived)
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 4 x 4
## # Groups:   gender [2]
##   gender survived count percent
##   <fct>  <chr>    <int>   <dbl>
## 1 Female Lost       127    27.3
## 2 Female Saved      339    72.7
## 3 Male   Lost       690    81.1
## 4 Male   Saved      161    18.9
# Survival rate by class and gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(class, gender, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, gender)
## `summarise()` has grouped output by 'class', 'gender'. You can override using the `.groups` argument.
## # A tibble: 12 x 5
## # Groups:   class, gender [6]
##    class     gender survived count percent
##    <fct>     <fct>  <chr>    <int>   <dbl>
##  1 1st Class Female Lost         5    3.47
##  2 1st Class Female Saved      139   96.5 
##  3 1st Class Male   Lost       118   65.6 
##  4 1st Class Male   Saved       62   34.4 
##  5 2nd Class Female Lost        12   11.3 
##  6 2nd Class Female Saved       94   88.7 
##  7 2nd Class Male   Lost       154   86.5 
##  8 2nd Class Male   Saved       24   13.5 
##  9 3rd Class Female Lost       110   50.9 
## 10 3rd Class Female Saved      106   49.1 
## 11 3rd Class Male   Lost       418   84.8 
## 12 3rd Class Male   Saved       75   15.2

Density ridges

surv_classhist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, class)) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Class", 
   x = "Age Distribution", y = "Passenger Class") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 3.69

surv_agehist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, gender)) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Gender", 
   x = "Age Distribution", y = "Passenger Gender") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 3.88

surv_ageclass_hist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, gender)) +
   facet_wrap(~class, nrow=3) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Class and Gender", 
   x = "Age Distribution", y = "Passenger Gender") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 6.54
## Picking joint bandwidth of 5.57
## Picking joint bandwidth of 2.96

Fares

Load data

fares <- import(here("data", "avgfare.csv")) %>% 
   clean_names()

Calculate inflation

p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)

p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)

p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)

p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)

p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)

p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)

p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)

p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)

p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)

p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)

p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)

Reshape fare data

faredat <- gather(fares, year, price, fare_1912:fare_2021)

faredat <- faredat %>% 
   mutate(year = case_when(year == "fare_1912" ~ 1912,
   year == "fare_1921" ~ 1921,
   year == "fare_1931" ~ 1931,
   year == "fare_1941" ~ 1941,
   year == "fare_1951" ~ 1951,
   year == "fare_1961" ~ 1961,
   year == "fare_1971" ~ 1971,
   year == "fare_1981" ~ 1981,
   year == "fare_1991" ~ 1991,
   year == "fare_2001" ~ 2001,
   year == "fare_2011" ~ 2011,
   year == "fare_2021" ~ 2021, ))

faredat$accommodation <- as.factor(faredat$accommodation)

faredat$accomodation <- factor(faredat$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))

Plot fare inflation

fare_graph <- faredat %>% 
   ggplot(aes(year, price, colour=accommodation)) +
   geom_line() +
   geom_point() +
   scale_colour_brewer(palette="Spectral") +
   facet_wrap(~ accommodation, 4, scales = "free") +
   xlim(1912,2021) +
   theme(panel.spacing = unit(1, "lines")) +
   theme_minimal()

fare_graph

ggplotly(fare_graph)